{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append(\"..\")\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "W = np.arange(21).reshape(7, 3)\n",
    "print(W)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(W[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "idx = np.array([1, 0, 3, 0])\n",
    "print(W[idx])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Embedding:\n",
    "    def __init__(self, W):\n",
    "        self.params = [W]\n",
    "        self.grads = [np.zeros_like(W)]\n",
    "        self.idx = None\n",
    "    \n",
    "    def forward(self, idx):\n",
    "        W, = self.params\n",
    "        self.idx = idx\n",
    "        out = W[idx]\n",
    "        return out\n",
    "    \n",
    "    def backward(self, dout):\n",
    "        dW, = self.grads\n",
    "        dW[...] = 0\n",
    "        # for i, word_id in enumerate(self.idx):\n",
    "        #     dW[word_id] += dout[i]\n",
    "        np.add.at(dW, self.idx, dout)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class EmbeddingDot:\n",
    "    def __init__(self, W):\n",
    "        self.embed = Embedding(W)\n",
    "        self.params = self.embed.params\n",
    "        self.grads = self.embed.grads\n",
    "        self.cache = None\n",
    "    \n",
    "    def forward(self, h, idx):\n",
    "        target_W = self.embed.forward(idx)\n",
    "        out = np.sum(target_W * h, axis=1)\n",
    "        self.cache = (h, target_W)\n",
    "        return out\n",
    "    \n",
    "    def backward(self, dout):\n",
    "        h, target_W = self.cache\n",
    "        dout = dout.reshape(dout.shape[0], 1)\n",
    "        dtarget_W = dout * h\n",
    "        self.embed.backward(dtarget_W)\n",
    "        dh = dout * target_W\n",
    "        return dh"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "words = \"You say goodbye I hello .\".replace(\".\", \" .\").lower().split()\n",
    "sns.countplot(np.random.choice(words, size=1000000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.choice(words, size=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.choice(words, size=5, replace=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p = [0.5, 0.1, 0.05, 0.2, 0.05, 0.1]\n",
    "sns.countplot(np.random.choice(words, p=p, size=1000000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p = [0.7, 0.29, 0.01]\n",
    "new_p = np.power(p, 0.75)\n",
    "new_p /= np.sum(new_p)\n",
    "print(new_p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from negative_sampling_layer import UnigramSampler\n",
    "from negative_sampling_layer import SigmoidWithLoss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "corpus = np.array([0, 1, 2, 3, 4, 1, 2, 3])\n",
    "power = 0.75\n",
    "sample_size = 2\n",
    "\n",
    "sampler = UnigramSampler(corpus, power, sample_size)\n",
    "target = np.array([1, 3, 0])\n",
    "negative_sample = sampler.get_negative_sample(target)\n",
    "print(negative_sample)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class NegativeSamplingLoss:\n",
    "    def __init__(self, W, corpus, power=0.75, sample_size=5):\n",
    "        self.sample_size = sample_size\n",
    "        self.sampler = UnigramSampler(corpus, power, sample_size)\n",
    "        self.loss_layers = [SigmoidWithLoss() for  _ in range(sample_size + 1)]\n",
    "        self.embed_dot_layers = [EmbeddingDot(W) for _ in range(sample_size + 1)]\n",
    "        \n",
    "        self.params, self.grads = [], []\n",
    "        for layer in self.embed_dot_layers:\n",
    "            self.params += layer.params\n",
    "            self.grads += layer.grads\n",
    "    \n",
    "    def forward(self, h, target):\n",
    "        batch_size = target.shape[0]\n",
    "        negative_sample = self.sampler.get_negative_sample(target)\n",
    "        \n",
    "        # positive forward\n",
    "        score = self.embed_dot_layers[0].forward(h, target)\n",
    "        correct_label = np.ones(batch_size, dtype=np.int32)\n",
    "        loss = self.loss_layers[0].forward(score, correct_label)\n",
    "        \n",
    "        # negative forward\n",
    "        negative_label = np.zeros(batch_size, dtype=np.int32)\n",
    "        for i in range(self.sample_size):\n",
    "            negative_target = negative_sample[:, i]\n",
    "            score = self.embed_dot_layers[1 + i].forward(h, negative_target)\n",
    "            loss += self.loss_layers[1 + i].forward(score, negative_label)\n",
    "        return loss\n",
    "    \n",
    "    def backward(self, dout=1):\n",
    "        dh = 0\n",
    "        for l0, l1 in zip(self.loss_layers, self.embed_dot_layers):\n",
    "            dscore = l0.backward(dout)\n",
    "            dh += l1.backward(dscore)\n",
    "        return dh"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from common.layers import Embedding\n",
    "from negative_sampling_layer import NegativeSamplingLoss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class CBOW:\n",
    "    def __init__(self, vocab_size, hidden_size, window_size, corpus):\n",
    "        V, H = vocab_size, hidden_size\n",
    "        \n",
    "        # initialize weight\n",
    "        W_in = 0.01 * np.random.randn(V, H).astype('f')\n",
    "        W_out = 0.01 * np.random.randn(V, H).astype('f')\n",
    "        \n",
    "        # create layers\n",
    "        self.in_layers = []\n",
    "        for i in range(2 * window_size):\n",
    "            layer = Embedding(W_in)\n",
    "            self.in_layers.append(layer)\n",
    "        self.ns_loss = NegativeSamplingLoss(W_out, corpus, power=0.75, sample_size=5)\n",
    "        \n",
    "        # gather all weights and gradients\n",
    "        layers = self.in_layers + [self.ns_loss]\n",
    "        self.params, self.grads = [], []\n",
    "        for layer in layers:\n",
    "            self.params += layer.params\n",
    "            self.grads += layer.grads\n",
    "        \n",
    "        # set word distributed representation to member variable\n",
    "        self.word_vecs = W_in\n",
    "    \n",
    "    def forward(self, contexts, target):\n",
    "        h = 0\n",
    "        for i, layer in enumerate(self.in_layers):\n",
    "            h += layer.forward(contexts[:, i])\n",
    "        h /= len(self.in_layers)\n",
    "        loss = self.ns_loss.forward(h, target)\n",
    "        return loss\n",
    "    \n",
    "    def backward(self, dout=1):\n",
    "        dout = self.ns_loss.backward(dout)\n",
    "        dout /= len(self.in_layers)\n",
    "        for layer in self.in_layers:\n",
    "            layer.backward(dout)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from common import config\n",
    "from common.trainer import Trainer\n",
    "from common.optimizer import Adam\n",
    "from cbow import CBOW\n",
    "from common.util import create_contexts_target, to_cpu, to_gpu\n",
    "from dataset import ptb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set hyper-parameters\n",
    "window_size = 5\n",
    "hidden_size = 100\n",
    "batch_size = 10\n",
    "max_epoch = 1\n",
    "\n",
    "# load data\n",
    "corpus, word_to_id, id_to_word = ptb.load_data(\"train\")\n",
    "vocab_size = len(word_to_id)\n",
    "\n",
    "contexts, target = create_contexts_target(corpus, window_size)\n",
    "if config.GPU:\n",
    "    contexts, target = to_gpu(contexts), to_gpu(target)\n",
    "\n",
    "# create model\n",
    "model = CBOW(vocab_size, hidden_size, window_size, corpus)\n",
    "optimizer = Adam()\n",
    "trainer = Trainer(model, optimizer)\n",
    "\n",
    "# fitting\n",
    "trainer.fit(contexts, target, max_epoch, batch_size)\n",
    "trainer.plot()\n",
    "\n",
    "# save data\n",
    "word_vecs = model.word_vecs\n",
    "if config.GPU:\n",
    "    word_vecs = to_cpu(word_vecs)\n",
    "params = {}\n",
    "params[\"word_vecs\"] = word_vecs.astype(np.float16)\n",
    "params[\"word_to_id\"] = word_to_id\n",
    "params[\"id_to_word\"] = id_to_word\n",
    "pkl_file = \"my_cbow_params.pkl\"\n",
    "with open(pkl_file, \"wb\") as f:\n",
    "    pickle.dump(params, f, -1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from common.util import most_similar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pkl_file = \"my_cbow_params.pkl\"\n",
    "with open(pkl_file, \"rb\") as f:\n",
    "    params = pickle.load(f)\n",
    "    word_vecs = params[\"word_vecs\"]\n",
    "    word_to_id = params[\"word_to_id\"]\n",
    "    id_to_word = params[\"id_to_word\"]\n",
    "\n",
    "querys = [\"you\", \"year\", \"car\", \"toyota\"]\n",
    "for query in querys:\n",
    "    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from common.util import analogy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "analogy(\"king\", \"man\", \"queen\", word_to_id, id_to_word, word_vecs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "analogy(\"take\", \"took\", \"go\", word_to_id, id_to_word, word_vecs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "analogy(\"car\", \"cars\", \"child\", word_to_id, id_to_word, word_vecs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "analogy(\"good\", \"better\", \"bad\", word_to_id, id_to_word, word_vecs)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
